{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python27\\lib\\site-packages\\ipykernel\\__main__.py:10: FutureWarning: Categorical.from_array is deprecated, use Categorical instead\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
       "            max_features=None, max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            presort=False, random_state=1, splitter='best')"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas\n",
    "\n",
    "# Set index_col to False to avoid pandas thinking that the first column is row indexes (it's age).\n",
    "columns = [\"age\", \"workclass\", \"fnlwgt\", \"education\", \"education_num\", \"marital_status\", \"occupation\", \"relationship\", \"race\", \"sex\", \n",
    "           \"capital_gain\", \"capital_loss\", \"hours_per_week\", \"native_country\", \"high_income\"]\n",
    "#income = pandas.read_csv(\"income.csv\", index_col=False,names=columns)\n",
    "income = pandas.read_csv(\"income.csv\", names=columns)\n",
    "\n",
    "for name in [\"education\", \"marital_status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"native_country\", \"high_income\",\"workclass\"]:\n",
    "    col = pandas.Categorical.from_array(income[name])\n",
    "    income[name] = col.codes\n",
    "#print(income.head(5))\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# A list of columns to train with.\n",
    "# All columns have been converted to numeric.\n",
    "columns = [\"age\", \"workclass\", \"education_num\", \"marital_status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"hours_per_week\", \"native_country\"]\n",
    "\n",
    "# Instantiate the classifier.\n",
    "# Set random_state to 1 to keep results consistent.\n",
    "clf = DecisionTreeClassifier(random_state=1)\n",
    "\n",
    "# The variable income is loaded, and contains all the income data.\n",
    "clf.fit(income[columns], income[\"high_income\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy\n",
    "import math\n",
    "\n",
    "# Set a random seed so the shuffle is the same every time.\n",
    "numpy.random.seed(10)\n",
    "\n",
    "# Shuffle the rows.  This first permutes the index randomly using numpy.random.permutation.\n",
    "# Then, it reindexes the dataframe with this.\n",
    "# The net effect is to put the rows into random order.\n",
    "income = income.reindex(numpy.random.permutation(income.index))\n",
    "\n",
    "train_max_row = math.floor(income.shape[0] * .8)\n",
    "#print train_max_row\n",
    "train = income.iloc[:int(train_max_row)]\n",
    "test = income.iloc[int(train_max_row):]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.640594059406\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "clf = DecisionTreeClassifier(random_state=1)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "error = roc_auc_score(test[\"high_income\"], predictions)\n",
    "print(error)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.991071428571\n"
     ]
    }
   ],
   "source": [
    "predictions = clf.predict(train[columns])\n",
    "print(roc_auc_score(train[\"high_income\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.650495049505\n",
      "0.89794939715\n"
     ]
    }
   ],
   "source": [
    "# Decision trees model from the last screen.\n",
    "clf = DecisionTreeClassifier(random_state=10)\n",
    "clf = DecisionTreeClassifier(min_samples_split=5, random_state=10)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "predictions = clf.predict(test[columns])\n",
    "test_auc = roc_auc_score(test[\"high_income\"], predictions)\n",
    "\n",
    "train_predictions = clf.predict(train[columns])\n",
    "train_auc = roc_auc_score(train[\"high_income\"], train_predictions)\n",
    "\n",
    "print(test_auc)\n",
    "print(train_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.640594059406\n",
      "0.991071428571\n",
      "0.725346534653\n",
      "0.764625959079\n"
     ]
    }
   ],
   "source": [
    "# First decision trees model we trained and tested.\n",
    "clf = DecisionTreeClassifier(random_state=1)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "predictions = clf.predict(test[columns])\n",
    "test_auc = roc_auc_score(test[\"high_income\"], predictions)\n",
    "\n",
    "train_predictions = clf.predict(train[columns])\n",
    "train_auc = roc_auc_score(train[\"high_income\"], train_predictions)\n",
    "\n",
    "print(test_auc)\n",
    "print(train_auc)\n",
    "clf = DecisionTreeClassifier(random_state=1, min_samples_split=13, max_depth=7)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "predictions = clf.predict(test[columns])\n",
    "test_auc = roc_auc_score(test[\"high_income\"], predictions)\n",
    "\n",
    "train_predictions = clf.predict(train[columns])\n",
    "train_auc = roc_auc_score(train[\"high_income\"], train_predictions)\n",
    "\n",
    "print(test_auc)\n",
    "print(train_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.650495049505\n",
      "0.995535714286\n"
     ]
    }
   ],
   "source": [
    "numpy.random.seed(10)\n",
    "\n",
    "# Generate a column with random numbers from 0 to 4.\n",
    "income[\"noise\"] = numpy.random.randint(4, size=income.shape[0])\n",
    "\n",
    "# Adjust columns to include the noise column.\n",
    "columns = [\"noise\", \"age\", \"workclass\", \"education_num\", \"marital_status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"hours_per_week\", \"native_country\"]\n",
    "\n",
    "# Make new train and test sets.\n",
    "train_max_row = math.floor(income.shape[0] * .8)\n",
    "train = income.iloc[:int(train_max_row)]\n",
    "test = income.iloc[int(train_max_row):]\n",
    "\n",
    "# Initialize the classifier.\n",
    "clf = DecisionTreeClassifier(random_state=1)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "predictions = clf.predict(test[columns])\n",
    "test_auc = roc_auc_score(test[\"high_income\"], predictions)\n",
    "\n",
    "train_predictions = clf.predict(train[columns])\n",
    "train_auc = roc_auc_score(train[\"high_income\"], train_predictions)\n",
    "\n",
    "print(test_auc)\n",
    "print(train_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.635247524752\n",
      "0.715445544554\n"
     ]
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "columns = [\"age\", \"workclass\", \"education_num\", \"marital_status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"hours_per_week\", \"native_country\"]\n",
    "\n",
    "clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)\n",
    "clf2.fit(train[columns], train[\"high_income\"])\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))\n",
    "\n",
    "predictions = clf2.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.660396039604\n"
     ]
    }
   ],
   "source": [
    "predictions = clf.predict_proba(test[columns])[:,1]\n",
    "predictions2 = clf2.predict_proba(test[columns])[:,1]\n",
    "combined = (predictions + predictions2) / 2\n",
    "rounded = numpy.round(combined)\n",
    "\n",
    "print(roc_auc_score(test[\"high_income\"], rounded))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.685346534653\n"
     ]
    }
   ],
   "source": [
    "# We'll build 10 trees\n",
    "tree_count = 10\n",
    "\n",
    "# Each \"bag\" will have 60% of the number of original rows.\n",
    "bag_proportion = .6\n",
    "\n",
    "predictions = []\n",
    "for i in range(tree_count):\n",
    "    # We select 60% of the rows from train, sampling with replacement.\n",
    "    # We set a random state to ensure we'll be able to replicate our results.\n",
    "    # We set it to i instead of a fixed value so we don't get the same sample every loop.\n",
    "    # That would make all of our trees the same.\n",
    "    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)\n",
    "    \n",
    "    # Fit a decision tree model to the \"bag\".\n",
    "    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)\n",
    "    clf.fit(bag[columns], bag[\"high_income\"])\n",
    "    \n",
    "    # Using the model, make predictions on the test data.\n",
    "    predictions.append(clf.predict_proba(test[columns])[:,1])\n",
    "combined = numpy.sum(predictions, axis=0) / 10\n",
    "rounded = numpy.round(combined)\n",
    "\n",
    "print(roc_auc_score(test[\"high_income\"], rounded))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.685346534653\n",
      "0.660198019802\n"
     ]
    }
   ],
   "source": [
    "# We'll build 10 trees\n",
    "tree_count = 10\n",
    "\n",
    "# Each \"bag\" will have 60% of the number of original rows.\n",
    "bag_proportion = .6\n",
    "\n",
    "predictions = []\n",
    "for i in range(tree_count):\n",
    "    # We select 60% of the rows from train, sampling with replacement.\n",
    "    # We set a random state to ensure we'll be able to replicate our results.\n",
    "    # We set it to i instead of a fixed value so we don't get the same sample every time.\n",
    "    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)\n",
    "    \n",
    "    # Fit a decision tree model to the \"bag\".\n",
    "    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)\n",
    "    clf.fit(bag[columns], bag[\"high_income\"])\n",
    "    \n",
    "    # Using the model, make predictions on the test data.\n",
    "    predictions.append(clf.predict_proba(test[columns])[:,1])\n",
    "\n",
    "combined = numpy.sum(predictions, axis=0) / 10\n",
    "rounded = numpy.round(combined)\n",
    "\n",
    "print(roc_auc_score(test[\"high_income\"], rounded))\n",
    "predictions = []\n",
    "for i in range(tree_count):\n",
    "    # We select 60% of the rows from train, sampling with replacement.\n",
    "    # We set a random state to ensure we'll be able to replicate our results.\n",
    "    # We set it to i instead of a fixed value so we don't get the same sample every time.\n",
    "    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)\n",
    "    \n",
    "    # Fit a decision tree model to the \"bag\".\n",
    "    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter=\"random\", max_features=\"auto\")\n",
    "    clf.fit(bag[columns], bag[\"high_income\"])\n",
    "    \n",
    "    # Using the model, make predictions on the test data.\n",
    "    predictions.append(clf.predict_proba(test[columns])[:,1])\n",
    "\n",
    "combined = numpy.sum(predictions, axis=0) / 10\n",
    "rounded = numpy.round(combined)\n",
    "\n",
    "print(roc_auc_score(test[\"high_income\"], rounded))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.675445544554\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.675445544554\n",
      "0.685346534653\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)\n",
    "\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))\n",
    "clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)\n",
    "\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.650297029703\n",
      "0.685148514851\n"
     ]
    }
   ],
   "source": [
    "clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)\n",
    "\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(train[columns])\n",
    "\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))\n",
    "\n",
    "clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5)\n",
    "clf.fit(train[columns], train[\"high_income\"])\n",
    "\n",
    "predictions = clf.predict(train[columns])\n",
    "\n",
    "\n",
    "predictions = clf.predict(test[columns])\n",
    "print(roc_auc_score(test[\"high_income\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
